require(plyr)
require(dplyr)

# Set the working directory so the relative "./data/..." paths used below resolve.
# NOTE(review): "..." looks like a placeholder -- point it at the project root
# before running.
setwd("...")

# Restore the saved R objects from the .RData file into the workspace.
# NOTE(review): objects used below without being created in this script
# (e.g. synCounts, allKappaMatrix, themes, models, dsets) presumably come from
# this file -- confirm.
load("./data/predictionsAndTopics.RData")

#### Table 1: Original terms used with thesaurus to create dictionaries ####
# For each theme, drop the entryID, presAdmin and Date columns from that
# theme's synCounts table, then print the number of remaining columns and the
# sum of all remaining values.

# Infantilization
countsINF = dplyr::select(synCounts[["INF"]], -c("entryID", "presAdmin", "Date"))
ncol(countsINF)
sum(countsINF)

# Animal analogies
countsANIM = dplyr::select(synCounts[["ANIM"]], -c("entryID", "presAdmin", "Date"))
ncol(countsANIM)
sum(countsANIM)

# Belligerence
countsBELLIG = dplyr::select(synCounts[["BELLIG"]], -c("entryID", "presAdmin", "Date"))
ncol(countsBELLIG)
sum(countsBELLIG)

# Irrationality
countsIRR = dplyr::select(synCounts[["IRR"]], -c("entryID", "presAdmin", "Date"))
ncol(countsIRR)
sum(countsIRR)



##### Find best models according to kappa #####

# For every element of allKappaMatrix, find the row index of the largest value
# in each column, then stack the per-element results into one table
# (rows labelled by `models`, columns labelled by `themes`).
themeMetricData = do.call(
  rbind.data.frame,
  lapply(allKappaMatrix, function(kappaMat) apply(kappaMat, 2, which.max))
)
colnames(themeMetricData) = themes
rownames(themeMetricData) = models

# themeMetricData holds indices into dsets; look up the corresponding dataset
# names and arrange them in a table of the same shape.
bestMetricData = as.data.frame(
  matrix(dsets[unlist(themeMetricData)],
         nrow=nrow(themeMetricData),
         ncol=ncol(themeMetricData))
)
colnames(bestMetricData) = themes
rownames(bestMetricData) = models

# The largest value itself (ignoring NA) for each column of each element.
themeMetric = do.call(
  rbind.data.frame,
  lapply(allKappaMatrix, function(kappaMat) apply(kappaMat, 2, max, na.rm=TRUE))
)
colnames(themeMetric) = themes
rownames(themeMetric) = models


# REDO THE PROCESS FOR DATA WITH SYNONYMS
matDataSyn = allKappaMatrixSyn

# Identify which datasets made the best results for each measure and model.
# which.max() already discards NA, so the index is taken from the untouched
# values and always points at the same cell whose value max(na.rm=TRUE)
# reports further down. (Recoding NA to 0 first lets an NA cell "win" whenever
# every observed value in a column is negative, so index and value disagree.)
# A column with no observed value at all falls back to index 1, which is what
# the NA-to-0 recoding produced.
otmlist = vector("list", length(matDataSyn))
for (k in seq_along(matDataSyn)) {
  otm = matDataSyn[[k]]
  otmlist[[k]] = apply(otm, 2, function(colValues) {
    best = which.max(colValues)
    if (length(best) == 0) 1L else best
  })
}

themeMetricDataSyn = do.call(rbind.data.frame, otmlist)
names(themeMetricDataSyn) = themes
row.names(themeMetricDataSyn) = paste0(models, "Syn")

# themeMetricDataSyn holds indices into dsets; now, identify the name of the
# data and tag it with the "WithSyns" suffix
bestMetricSyn = dsets[unlist(themeMetricDataSyn)]
bestMetricSyn = paste0(bestMetricSyn, "WithSyns")
bestMetricSyn = as.data.frame(matrix(bestMetricSyn,
                                     ncol=ncol(themeMetricDataSyn), nrow=nrow(themeMetricDataSyn)))
names(bestMetricSyn) = themes
row.names(bestMetricSyn) = models

# What is the actual measure of the metric for the best models?
themeMetricSyn = lapply(matDataSyn, apply, 2, max, na.rm=TRUE)
themeMetricSyn = do.call(rbind.data.frame, themeMetricSyn)
names(themeMetricSyn) = themes
row.names(themeMetricSyn) = paste0(models, "Syn")


# REDO THE PROCESS FOR *ONLY* SYNONYMS
matDataDict = allKappaMatrixDict

# Identify which datasets made the best results for each measure and model.
# which.max() already discards NA, so the index is taken from the untouched
# values and always points at the same cell whose value max(na.rm=TRUE)
# reports further down. (Recoding NA to 0 first lets an NA cell "win" whenever
# every observed value in a column is negative, so index and value disagree.)
# A column with no observed value at all falls back to index 1, which is what
# the NA-to-0 recoding produced.
otmlistD = vector("list", length(matDataDict))
for (k in seq_along(matDataDict)) {
  otm = matDataDict[[k]]
  otmlistD[[k]] = apply(otm, 2, function(colValues) {
    best = which.max(colValues)
    if (length(best) == 0) 1L else best
  })
}

themeMetricDataDict = do.call(rbind.data.frame, otmlistD)
names(themeMetricDataDict) = themes
row.names(themeMetricDataDict) = paste0(models, "OnlySyn")

# No index-to-name lookup is done here: every cell of the data-name table is
# filled with the single label "onlySyns", in the shape of themeMetricDataDict
bestMetricDict = "onlySyns"
bestMetricDict = as.data.frame(matrix(bestMetricDict,
                                      ncol=ncol(themeMetricDataDict), nrow=nrow(themeMetricDataDict)))
names(bestMetricDict) = themes
row.names(bestMetricDict) = models

# What is the actual Kappa for the best models?
themeMetricDict = lapply(matDataDict, apply, 2, max, na.rm=TRUE)
themeMetricDict = do.call(rbind.data.frame, themeMetricDict)
names(themeMetricDict) = themes
row.names(themeMetricDict) = paste0(models, "OnlySyn")

# Put results from all three sets of analyses together. The tables are stacked
# in a fixed order: original data, data with synonyms, synonyms only.
bestDatas = rbind(bestMetricData, bestMetricSyn, bestMetricDict)
bestDataNums = rbind(themeMetricData, themeMetricDataSyn, themeMetricDataDict)
bestMetrics = rbind(themeMetric, themeMetricSyn, themeMetricDict)

# Label every stacked row explicitly, in the same order as the rbind() calls
# above. Deriving the label from the combined table's row names instead would
# depend on rbind() de-duplicating repeated names by appending "1"/"2", and on
# no model name containing those digits.
stackedModelNames = c(models,
                      paste0(models, "WithSyns"),
                      paste0(models, "OnlySyns"))
stopifnot(length(stackedModelNames) == nrow(bestMetrics))

# Now, for each theme, find which model and data did best
bm = apply(bestMetrics, 2, which.max)

# One (row, column) pair per theme: the winning row within that theme's column
bestCells = cbind(unname(bm), seq_along(bm))
bestDataNum = as.matrix(bestDataNums)[bestCells]
bestDataName = as.matrix(bestDatas)[bestCells]
bestPredModel = stackedModelNames[unname(bm)]

data.frame(themes, bestDataName, bestPredModel)


## Get the predictive data we need
# Label the entries of each prediction list and each table list: plain model
# names for the first set, model name + "WithSyns" for the second and
# model name + "OnlySyns" for the third. Then concatenate the three lists of
# each kind into one list.
synLabels = paste0(models, "WithSyns")
dictLabels = paste0(models, "OnlySyns")

names(allPredsList) = names(allTabsList) = models
names(allPredsListSyn) = names(allTabsListSyn) = synLabels
names(allPredsListDict) = names(allTabsListDict) = dictLabels

bothPredsList = c(allPredsList, allPredsListSyn, allPredsListDict)
bothTabsList = c(allTabsList, allTabsListSyn, allTabsListDict)


# Pull the best predictions from everything
# For each theme, take the entry of bothPredsList named by bestPredModel, pick
# the element at position bestDataNum inside it, and keep its entryID column
# plus the column named after the theme.
predsByTheme = lapply(seq_along(themes), function(themeIdx) {
  themeName = themes[themeIdx]
  modelPreds = bothPredsList[bestPredModel[themeIdx]][[1]]
  modelPreds[[bestDataNum[themeIdx]]][, c("entryID", themeName)]
})

# Merge the per-theme tables left to right, keeping every row of the
# accumulated (left-hand) table at each step.
bestPredMatrix = Reduce(
  function(merged, nextPreds) merge(merged, nextPreds, all.x=TRUE),
  predsByTheme
)
names(bestPredMatrix) = c("entryID", paste0("pred", themes))
  

#### Table A15: Confusion matrices for best performing models ####

# For each theme, take the entry of bothTabsList named by bestPredModel, pick
# the element at position bestDataNum inside it, and keep its i-th component
# (single-bracket subset, so the container type is preserved).
bestTabMatrix = lapply(seq_along(themes), function(themeIdx) {
  modelTabs = bothTabsList[bestPredModel[themeIdx]][[1]]
  modelTabs[[bestDataNum[themeIdx]]][themeIdx]
})
names(bestTabMatrix) = paste0("pred", themes)
bestTabMatrix


# Put the predictions and topics into the data (rows present in both, by entryID)
predsAndTopics = join(bestPredMatrix, topicData, by="entryID", type="inner")

# Add in the synonym data: one column per theme holding, for every row, the sum
# across all columns of that theme's synCounts table other than entryID,
# presAdmin and Date.
# NOTE(review): the totals are attached to pde$entryID by row position, which
# assumes each synCounts table has its rows in the same order as pde -- confirm.
countsData = propData = matrix(NA, nrow=nrow(pde), ncol=length(themes))
for (themeIdx in seq_along(themes)) {
  themeCounts = synCounts[[which(names(synCounts) == themes[themeIdx])]]
  termCounts = dplyr::select(themeCounts, -c("entryID", "presAdmin", "Date"))
  countsData[, themeIdx] = apply(termCounts, 1, sum)
}
countsData = as.data.frame(countsData)
colnames(countsData) = paste0("count", themes)

countsData = data.frame(entryID=pde$entryID, countsData)

predsAndTopicsAndCounts = join(predsAndTopics, countsData, by="entryID", type="inner")

### Create entry-country-level data
# Left-join the predictions/topics/counts onto pdc by entryID, then left-join
# the leader data onto the result.
dd = join(x=pdc, y=predsAndTopicsAndCounts, by="entryID", type="left")
ddl = join(x=dd, y=leaderCount, by="entryID", type="left")

# Keep the columns from pdbID through nWords, theccode (renamed to ccode), the
# leader variables, the four predictions, the four counts, and the columns
# from Topic1 through Topic65 -- in that order.
ddl = dplyr::select(
  ddl,
  pdbID:nWords,
  ccode=theccode,
  leaderMention, leaderTenure,
  predINF, predANIM, predBELLIG, predIRR,
  countINF, countANIM, countBELLIG, countIRR,
  Topic1:Topic65
)

# Write the result without a row-name column
write.csv(ddl, file="./data/predictedData.csv", row.names=FALSE)